library(ape)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:base':
##
## strsplit
library(seqinr)
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
## The following objects are masked from 'package:ape':
##
## as.alignment, consensus
library(ggtree)
## ggtree v3.10.0 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
##
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
##
## Guangchuang Yu. Using ggtree to visualize data on tree-like structures.
## Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:Biostrings':
##
## collapse
## The following object is masked from 'package:IRanges':
##
## collapse
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following object is masked from 'package:ape':
##
## rotate
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
## ggmsa v1.8.0 Document: http://yulab-smu.top/ggmsa/
##
## If you use ggmsa in published research, please cite:
## L Zhou, T Feng, S Xu, F Gao, TT Lam, Q Wang, T Wu, H Huang, L Zhan, L Li, Y Guan, Z Dai*, G Yu* ggmsa: a visual exploration tool for multiple sequence alignment and associated data. Briefings in Bioinformatics. DOI:10.1093/bib/bbac222
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:seqinr':
##
## count
## The following objects are masked from 'package:Biostrings':
##
## collapse, intersect, setdiff, setequal, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following object is masked from 'package:XVector':
##
## slice
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following object is masked from 'package:ape':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:ggtree':
##
## inset
library(stringr)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pheatmap)
Use msa package to read .fasta file rather than ape because seq_lenth inconsistency
# read fasta file
aa.sequence = readAAStringSet("./data_used/TumE.fasta") # use msa package
# read annotation file
TumE_annotations = read.csv("./data_used/TumE_annotations.csv", check.names=FALSE)
1: Seq_name inconsistency between two files
A: Using gsub delete extra name after AA_Seq_name
2: Multiple na values after distance calculation induces errors in
calculating hierarchical tree
A: Omit na values by rows and columns
# remove redundant name
aa.sequence@ranges@NAMES = gsub('\\s.*', '', aa.sequence@ranges@NAMES) # Align seq_names across two files
# Convert Multiple Sequence Alignment
aa.sequence.alignment <- msa(aa.sequence) # Alignment between AA sequences
## use default substitution matrix
aa.alignment = msaConvert(aa.sequence.alignment, type="seqinr::alignment") # Convert into seqinr format
# calculate distance
dist_matrix = dist.alignment(aa.alignment, "identity") |> as.matrix() # as.dist calculation
write.csv(dist_matrix, 'distance-matrix.csv')
# remove NAs
dist_matrix_rm = dist_matrix[apply(dist_matrix, 1, function(x) sd(x)!=0),] |> na.omit() # Na value remove from rows
dist_matrix_rm = dist_matrix_rm[, rownames(dist_matrix_rm)] |> as.dist() # Then remove na values in columns
## if use hclust
## cluster_result = hclust(as.dist(dist_matrix_rm))
cluster_result = dist_matrix_rm
tree = bionj(cluster_result)
# visualization
data = tidy_msa(aa.sequence) # convert sequence fasta file into tidyverse format
## Warning in rbind(c("M", "V", "P", "V", "N", "D", "G", "R", "S", "P", "A", :
## number of columns of result is not a multiple of vector length (arg 1)
p1 = ggtree(tree) +
geom_tiplab(size=1.5) +
geom_facet(geom=geom_msa, data=data, panel='msa', font=NULL, color="Chemistry_AA") +
xlim_tree(0.6)
## ! The tree contained negative edge length. If you want to ignore the edge, you
## can set `options(ignore.negative.edge=TRUE)`, then re-run ggtree.
ggsave(p1, file="tree.pdf", width=9, height=45)
p1
Here in this plot, we can calculate 15 main chunks(clusters).
# use of circular tree
p2 = ggtree(tree, layout='circular',
ladderize=FALSE, size=0.8, branch.length="none",col="red") +
geom_tiplab2(hjust=-0.3, size=0.7) +
geom_tippoint(size=0.5,col="blue") +
geom_nodepoint(color="black", alpha=1/4, size=2) +
theme(legend.title=element_text(face="bold"),
legend.position="bottom",
legend.box="horizontal",
legend.text=element_text(size=rel(0.5)))
ggsave(p2, file="tree_circular.pdf", width=9, height=30)
p2
In the following code chunk, I labeled the name of each leaf using the annotation results
# add taxonomy and function from annotation data
aa_tax_func = data.frame(aa=tree[["tip.label"]]) %>% left_join(TumE_annotations, join_by(aa==uniprotAC))
aa_tax_func$all_tax = apply(aa_tax_func, 1, function(x){paste0(x, collapse=" | ")})
# rectangular tree
tree_tax = tree
tree_tax[["tip.label"]] = aa_tax_func$all_tax
p3 = ggtree(tree_tax, layout='rectangular', size=0.8, col="deepskyblue3") +
geom_tiplab(size=1.5, color="purple4", hjust=-0.05) +
geom_tippoint(size=1.5, color="deepskyblue3") +
geom_nodepoint(color="pink", alpha=1/4, size=5) +
xlim(0, 0.91) +
theme_tree2()
## ! The tree contained negative edge length. If you want to ignore the edge, you
## can set `options(ignore.negative.edge=TRUE)`, then re-run ggtree.
ggsave(p3, file="tree_rectangular.pdf", width=15, height=45)
p3
We can see the annotation results is labled in each leaf, based on this,
we can check if my cluster results aligned with your prediction.
Here I have marked 15 clusters, combine with your species and functional annotations, we see that sequence from same genus and have same fucntions do prone to be clustered togather
# protein tree
p4 = ggtree(tree, layout='rectangular', size=0.8, col="deepskyblue3") +
geom_tiplab(size=1.5, color="purple4", hjust=-0.05) +
geom_tippoint(size=1.5, color="deepskyblue3") +
geom_nodepoint(color="pink", alpha=1/4, size=5) +
theme_tree2()
## ! The tree contained negative edge length. If you want to ignore the edge, you
## can set `options(ignore.negative.edge=TRUE)`, then re-run ggtree.
ggsave(p4, file="tree_protein.pdf", width=9, height=45)
p4
# rectangular tree
tree_tax = tree
tree_tax[["tip.label"]] = aa_tax_func$species
p5 = ggtree(tree_tax, layout='rectangular', size=0.8, col="deepskyblue3") +
geom_tiplab(size=1.5, color="purple4", hjust=-0.05) +
geom_tippoint(size=1.5, color="deepskyblue3") +
geom_nodepoint(color="pink", alpha=1/4, size=5) +
xlim(0, 0.64) +
theme_tree2()
## ! The tree contained negative edge length. If you want to ignore the edge, you
## can set `options(ignore.negative.edge=TRUE)`, then re-run ggtree.
ggsave(p5, file="tree_species.pdf", width=9, height=45)
p5
# rectangular tree
tree_func = tree
tree_func[["tip.label"]] = aa_tax_func$`deepfri prediction1`
p6 = ggtree(tree_func, layout='rectangular', size=0.8, col="deepskyblue3") +
geom_tiplab(size=1.5, color="purple4", hjust=-0.05) +
geom_tippoint(size=1.5, color="deepskyblue3") +
geom_nodepoint(color="pink", alpha=1/4, size=5) +
xlim(0, 0.59) +
theme_tree2()
## ! The tree contained negative edge length. If you want to ignore the edge, you
## can set `options(ignore.negative.edge=TRUE)`, then re-run ggtree.
ggsave(p6, file="tree_function.pdf", width=9, height=45)
p6
After I marked 15 clusters by hand using p3, I found that there are a package “factoextra” can let me automatically mark clusters by diferrent colors
# find optimum k (abandoned)
#fviz_nbclust(as.matrix(dist_matrix_rm), FUNcluster=kmeans, method='silhouette')
print(fviz_nbclust(as.matrix(dist_matrix_rm), FUNcluster=kmeans, method='silhouette'))
# Based on this plot, we find at k=2, there is a very high in-group score, after 2 there is a downwards trend, but not big. Same happens to k=6 and k=8.
# Conclusion: No Elbow Found!!!
# cluster
cluster_result = hclust(as.dist(dist_matrix_rm), method="ward.D")
# visualization
# As mentioned before, set k = 15.
p8 = fviz_dend(cluster_result,
k=15, rect=TRUE, horiz=TRUE, cex=0.3,
main='', ylab='') +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggsave(plot=p8, filename='./hclust.pdf', width=8, height=35)
cluster_label = data.frame(color=p8[["plot_env"]][["data"]][["labels"]][["col"]],
label=p8[["plot_env"]][["data"]][["labels"]][["label"]])
write.csv(cluster_label, './cluster_label.csv', row.names=FALSE)
p8
After the clustering using standard hclust, it gave us a benefit of
convert hierarchical tree into standard tables.
Please check cluster_label.csv file, you will find that I have convert
the hierarachical tree information into groups.
We have in this .csv file 15 different groups, The colors column have 15
colors, indicates 15 different clusters, and in the label group are the
AA sequence names.
Amino acid sequences possessing the same function are more likely to be
clustered together than the genus to which they belong. However, it
cannot be said that the genus it belongs to does not contribute to the
clustering results. Therefore, for protein sequences that perform
similar functions across different species and are clustered together,
we might pay extra attention. Their structural similarity and functional
resemblance suggest they may follow similar evolutionary pathways.
However, beyond these two pieces of information, we have discovered many
sequences that do not originate from the same genus nor perform the same
function, yet they are still clustered together in the same group. This
observation could indicate that some protein functions may have the same
origin. For instance, enzymes like DNA binding and acting
on ester bonds are often categorized into the same cluster,
sometimes even into the smallest cluster, which might only contain these
two enzymes. Hence, we can also study the evolutionary similarities
between these two enzymes. This type of similarity study can be assisted
by using phyloP scores.
After my former cluster results, I’ve provided some possible next steps here that would allow us to continue our analysis.
For e.g. AA compositions
aa.sequence_df = as.data.frame(aa.sequence)
# interest amino acid
aa = 'A0A0F7PAR7' # For e.g
aa.interest = aa.sequence_df[aa, 'x']
aa.interest_freq = aa.interest %>%
stringr::str_split("") %>%
table() %>%
as.data.frame() %>%
set_colnames(c('Amino_Acid', 'Percentage'))
# barplot
p7 = ggplot(aa.interest_freq, aes(x=Amino_Acid, y=Percentage)) +
geom_bar(stat="identity", fill="skyblue") +
labs(title=paste0("Composition of ", aa), x="Amino Acid", y="Percentage")
ggsave(p7, file="amino acid composition.pdf", width=8, height=6)
p7
# subset groups
subset_cluster_label = cluster_label[cluster_label$color == "#F8766DFF", ]
# Subset dist_matrix
labels_to_keep = subset_cluster_label$label
# Subset the dist_matrix to keep only rows and columns in labels_to_keep
dist_matrix_subset = dist_matrix[labels_to_keep, labels_to_keep]
# For e.g. we can use heatmap to visualize the
p9 <- pheatmap(as.matrix(dist_matrix_subset),
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "complete",
cellwidth = 10,
cellheight = 10,
border_color = NA)
ggsave(p9, file="subgroup heatmap example.pdf", width=10, height=10)
p9
Firstly, from my experience, the gut microbiota is complex and
interdependent. Current technology cannot replicate the internal
environment of the gut in vitro, nor maintain such a microbial quantity
within the gut outside the body. Therefore, expecting to study the gut
microbiota using in vitro experiments is nearly impossible,
making in vivo experiments crucial. On the other hand, the
complex environment of the gut also makes it difficult to obtain direct
findings from human in vivo experiments. For instance, methods
used in bovine studies, such as performing fistula surgery on the omasum
and installing transparent materials, are not feasible in humans. So,
how do we obtain data from in vivo experiments and extensively
mine the results? Bioinformatics methods are almost the only way.
By using plasma and fecal samples, we can extract ample microbial information, and after comparison, we can identify microbes of interest and purify and culture them in vitro to validate our findings. For many microbes, such bioinformatics experiments are almost the only way to focus on them, especially for some anaerobic microbes. I have experience in purifying and passaging anaerobic microbes and am well aware of the difficulties and costs associated with culturing and researching these types of microbes. Without bioinformatics data, few would turn their attention to these microbes. However, anaerobic and facultative-anaerobic microbes constitute a significant proportion in the gut. Therefore, I believe that using bioinformatics methods in studying gut microbiota is especially important.
Returning to the task you assigned, the protein sequences provided in your proteomics data enable us to identify groups of structurally similar proteins. This similarity suggests functional and evolutionary relationships that are challenging to characterize through biological experiments alone. We all remember the errors caused by biotaxonomy based on physical characteristics, which, even today, profoundly, affects biological research and is being replaced by genomics-based classification. In this challenge, I utilized the annotation table you provided that was matched with online databases and tools, which is indicative of the revolution that bioinformatics brings to biological research. Additionally, unsupervised learning methods based on neural networks, like your developed SWISS-MODEL or Google’s AlphaFold2, have unveiled a new world in protein research. I still remember how difficult and costly protein structure research was during my undergraduate studies. Cryo-electron microscopy has played such a pivotal role in protein structure research at that time, and owning the latest and most advanced cryo-EM meant groundbreaking discoveries. However, bioinformatics research based on computing and large models has corrected this path (with all respects), simplifying operations and costs, and making large-scale protein structure prediction possible.
In your paper, you not only predicted structures but also inferred functions through structural similarities. It is so hard that without bioinformatics technologies, such discoveries would be incredibly costly and time-consuming. Therefore, this is what I consider to be the role of bioinformatics in biomedical science: to screen, filter hypothesis, make discoveries, and reduce redundant biological experiments through lower costs, shorter times, more statistical precision, and greater repeatability, finally leading to biological experiment validation. This accelerates the benefits to human society.